library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(sf)
## Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1
library(raster)
## Loading required package: sp
##
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:tidyr':
##
## extract
# Load the merged Satellite image for Kazanlak Valley, Bulgaria
kaz <-brick("../1_Teaching/cds-spatial/data/Kaz.tif")
plotRGB(kaz, stretch= "lin")
crs(kaz)
## CRS arguments:
## +proj=utm +zone=35 +datum=WGS84 +units=m +no_defs
# Load prediction data as points (left bottom corner of the evaluated cell)
cnne_df <- read_csv("2021-10-25.predictions/results/east/east.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## x = col_double(),
## y = col_double(),
## mound_probability = col_double()
## )
# 15334 points (origins in the raster cells)
cnnw_df <- read_csv("2021-10-25.predictions/results/west/west.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## x = col_double(),
## y = col_double(),
## mound_probability = col_double()
## )
# 15334 points (origins in the raster cells)
cnn_df <- rbind(cnne_df, cnnw_df)
# 30504 rows
# Check the probabilities of there being a mound in a given cell
hist(cnn_df$mound_probability,
main = "Probability of mound present in a satellite image gridcell");abline(v=0.6, col="red", lty = 2, lwd = 3)
table(cut(cnn_df$mound_probability, c(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1)))
##
## (0,0.1] (0.1,0.2] (0.2,0.3] (0.3,0.4] (0.4,0.5] (0.5,0.6] (0.6,0.7] (0.7,0.8]
## 7865 7710 6983 4757 2243 653 195 58
## (0.8,0.9] (0.9,1]
## 30 10
# ok, so there is an exponential drop-off.
### Build a grid of those with 60%+ probability of containing a mound
# Build a grid from all points
#cnnall_sp <- st_as_sf(cnn_df, coords = c("x","y"), crs = 32635)
#cnnall_grid <- st_make_grid(cnnall_sp, cellsize = 250, what = "polygons")
# Filter predictions to those that have 60+% likelihood of containing a mound
cnn60_sp <- cnn_df %>%
filter(mound_probability > 0.59) %>% #333 observations
st_as_sf(coords = c("x","y"), crs = 32635)
# Make a grid of 60%+ cells
cnn_grid <- st_make_grid(cnn60_sp, cellsize = 250, what = "polygons")
plot(cnn_grid)
# Add probability data to the 60%+ grid
#cnnall_datagrid <- st_join(st_sf(cnnall_grid), cnnall_sp)
cnn_grid <- st_join(st_sf(cnn_grid), cnn60_sp)
# Visualize the grid cells with higher probability
ggplot(cnn_grid) +
geom_sf(aes(color = mound_probability))
# 333 raster cells are predicted to contain mounds with greater
# than 60% likelihood. Archaeologists found 773 mounds in fraction of the area
# View the grid
plotRGB(kaz, stretch= "lin");
plot(cnn_grid$geometry, add = TRUE, border = "white")
that we will need later for validation
# Bring in survey area to see overall coverage
survey <- st_read("../1_Teaching/cds-spatial/data/KAZ_surveyarea.shp")
## Reading layer `KAZ_surveyarea' from data source `C:\Users\Adela\Documents\RStudio\1_Teaching\cds-spatial\data\KAZ_surveyarea.shp' using driver `ESRI Shapefile'
## Simple feature collection with 98 features and 5 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 352181.7 ymin: 4712923 xmax: 368890.2 ymax: 4733359
## projected CRS: WGS 84 / UTM zone 35N
plot(survey$geometry)
# Bring in all the mounds observed in the field
mounds <- st_read("../1_Teaching/cds-spatial/data/KAZ_mounds.shp")
## Reading layer `KAZ_mounds' from data source `C:\Users\Adela\Documents\RStudio\1_Teaching\cds-spatial\data\KAZ_mounds.shp' using driver `ESRI Shapefile'
## Simple feature collection with 773 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 352481.3 ymin: 4712325 xmax: 371282.4 ymax: 4730029
## projected CRS: WGS 84 / UTM zone 35N
plot(mounds)
# Extrapolate rough study area after subtracting outliers
far <- mounds %>%
st_is_within_distance(survey, 1500) %>%
lengths>0
farmounds <- which(far==0) # these are the mounds that are mostly far from survey area
# convex hull of points without outliers
survey_sm <- st_convex_hull(st_union(mounds$geometry[-farmounds]))
# convex hull of survey polygons
survey_ch <- st_convex_hull(st_union(survey$geometry))
# quickly see the difference in bounding boxes
plot(survey_ch, col = "green");plot(survey_sm, col = "red", alpha=0.5, add = TRUE); plot(survey$geometry, add =TRUE)
# View all mounds
plotRGB(kaz, stretch= "lin");
plot(survey_ch, col = "green", add = TRUE);
plot(survey_sm, col = "red", add = TRUE);
plot(survey$geometry, col = "lightyellow", add = TRUE );
plot(mounds$geometry[-farmounds,], add = TRUE, col = "pink");
plot(cnn_grid$geometry, add = TRUE, border = "white")
Area surveyed in 2009-2011 is ca 85 sq km and contains 773 documented mounds of various shapes and sizes. This area contains 192 grid cells of 250m a side which were allocated 0.6+ probability of containing a mound. Let’s explore how many mounds are outside these grid cells and which mounds were predicted and which not. Bring in mound attributes and check correlation. Does size play a role?
# Bring in mound attributes (n = 773) to aid exploration
mounddata <- read_csv("../1_Teaching/cds-spatial/data/KAZ_mdata.csv")
## Parsed with column specification:
## cols(
## MoundID = col_double(),
## Condition = col_double(),
## Robbed = col_double(),
## Height = col_double(),
## LandUse = col_character()
## )
mounds <- mounds %>%
left_join(mounddata, by = c("TRAP_Code"="MoundID"))
# Select gridcells that fall within TRAP study area
survey_grids <- st_intersection(survey_ch, cnn_grid)
plot(survey_grids)
# Check spatial overlap between 60%+ grid cells and all 773 mounds
predicted <- st_intersection(mounds, cnn_grid)
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
# Which ones areun-predicted?
`%nin%` = Negate(`%in%`)
unpredicted <- mounds[mounds$TRAP_Code%nin%predicted$TRAP_Code,]
predicted #166 unique features here
## Simple feature collection with 166 features and 11 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 352481.3 ymin: 4713055 xmax: 366872 ymax: 4729119
## projected CRS: WGS 84 / UTM zone 35N
## First 10 features:
## Notes TRAP_Code Cluster Lat Long Condition Robbed Height LandUse
## 82 Mound 4066 3 42.55737 25.30552 3 1 5.0 Pasture
## 83 Mound 4067 3 42.56229 25.31209 2 1 2.0 Pasture
## 84 Mound 4068 3 42.56229 25.31277 2 1 5.0 Pasture
## 87 Mound 4071 3 42.56499 25.30981 3 1 1.5 Pasture
## 614 Mound 3354 3 42.58316 25.27124 1 1 0.5 Pasture
## 619 Mound 3406 3 42.58347 25.27130 3 1 2.0 Pasture
## 620 Mound 3407 3 42.58296 25.27126 2 1 1.0 Pasture
## 621 Mound 3408 3 42.58282 25.27120 2 1 2.0 Pasture
## 631 Mound 3701 3 42.58326 25.26997 1 0 5.0 Pasture
## 632 Mound 3702 3 42.58336 25.27074 2 1 4.0 Pasture
## X1 mound_probability geometry
## 82 4748 0.8855298 POINT (360896.8 4713055)
## 83 5499 0.7194230 POINT (361446.8 4713591)
## 84 5499 0.7194230 POINT (361503 4713589)
## 87 5315 0.6406531 POINT (361265.5 4713894)
## 614 14149 0.6249947 POINT (358140.6 4715976)
## 619 14149 0.6249947 POINT (358146.9 4716009)
## 620 14149 0.6249947 POINT (358141.8 4715954)
## 621 14149 0.6249947 POINT (358136.6 4715938)
## 631 14149 0.6249947 POINT (358037 4715989)
## 632 14149 0.6249947 POINT (358100.5 4715999)
unique(predicted$TRAP_Code) # 144 unique mounds
## [1] 4066 4067 4068 4071 3354 3406 3407 3408 3701 3702 3703 4019 4020 4008 4009
## [16] 4002 4024 4011 4006 4007 3049 3050 3051 4086 4087 3275 3276 5001 3340 3531
## [31] 3329 3532 3328 3341 3690 3691 3692 3693 3694 3695 3696 3102 3103 3101 3678
## [46] 3679 3663 2086 3445 3444 3443 3440 3439 3446 2092 3447 3442 2091 2093 3293
## [61] 3073 3673 3674 3675 3676 3677 3600 3602 3603 3604 3641 3640 3642 3643 3644
## [76] 3645 3646 3647 3648 3649 3650 3652 3651 3662 3664 3665 3666 3667 3668 3669
## [91] 2144 3537 3538 3539 2134 2135 2136 2137 2141 2142 2143 3312 3517 3707 3709
## [106] 3711 3712 3713 3714 2139 2148 2150 2151 2152 2153 2154 2175 3681 3113 1067
## [121] 2184 2192 2193 2194 2196 2186 2187 4108 3621 3625 3627 3141 3140 3616 3617
## [136] 3618 3619 3620 3626 3628 3128 3632 3633 3203 1034 1035
unique(cnn_grid$X1) #332 unique cells
## [1] 14800 13653 4 5 191 1687 2437 6480 13660 14812 13478 1886
## [13] 14 3159 202 14261 14817 12548 14263 14818 14238 14460 5581 7517
## [25] 2255 9762 5956 975 2284 8832 6964 968 420 4908 4909 7999
## [37] 426 9775 8096 2088 807 11093 11280 12028 6420 5980 10535 11657
## [49] 5056 13381 11658 2125 14680 6052 11475 4184 7175 7362 3998 4372
## [61] 6055 9982 4374 4561 4748 5122 5870 6057 11106 10432 3628 4002
## [73] 4188 4562 4749 4750 4936 4937 5123 5497 5871 6058 8302 13538
## [85] 1368 3816 4004 4565 4751 5313 5499 639 4005 4192 4379 4566
## [97] 4753 4007 4194 4754 4941 5315 5502 4382 7849 7852 12847 1955
## [109] 3968 7668 8038 8778 3266 10933 2119 10373 3082 7758 1569 2864
## [121] 4715 5639 14149 5087 4583 3608 7123 10638 6200 466 651 652
## [133] 1392 3981 6202 4774 5336 653 838 3654 3655 3841 3842 4215
## [145] 4216 5337 7394 285 1579 1580 2135 4910 5095 5280 5465 3656
## [157] 4591 5339 842 1027 1211 1212 1396 1397 4541 5651 3657 3844
## [169] 3845 4031 4032 4405 4966 4967 5154 1398 1583 4033 4968 5343
## [181] 3474 10767 2696 3849 4037 5345 5346 12873 4225 10210 3853 3256
## [193] 9362 13061 1984 3106 3481 4976 5163 5350 5537 5724 5911 9465
## [205] 10774 8993 12693 14173 8809 13620 14174 2336 12882 13437 4928 309
## [217] 3487 7414 7789 4190 5115 871 5857 873 14181 14551 1994 6295
## [229] 11345 863 14183 2183 6297 6672 13448 5928 2904 4940 5310 6308
## [241] 6609 11604 1246 3098 1990 3099 5874 6060 6614 6246 6247 6431
## [253] 6250 9579 14206 6993 2000 7920 8105 9029 9584 13839 13840 9316
## [265] 5886 7922 13842 10439 7923 8293 10513 4410 8480 8664 8849 9035
## [277] 10884 11439 4411 7741 8112 8482 8666 8667 12315 8113 12316 7930
## [289] 8670 10889 14775 9412 10891 13112 13851 14037 8118 10523 14752 13300
## [301] 7753 8123 4472 4473 724 14784 726 10717 3728 4289 4475 7943
## [313] 3355 3916 4290 4801 4987 14237 3359 9717 15327 2028 3138 4803
## [325] 13272 920 4806 5177 8877 13501 5733 5549
# 146 unique mounds (of 166 intersecting) appear in the 332 unique (of 380) grids
grids_n_mounds <- predicted %>%
# st_drop_geometry() %>%
group_by(X1) %>%
count()
hist(grids_n_mounds$n, breaks = 40)
grids_n_mounds %>%
filter(n>2)
## Simple feature collection with 16 features and 2 fields
## geometry type: MULTIPOINT
## dimension: XY
## bbox: xmin: 352481.3 ymin: 4715938 xmax: 358168.9 ymax: 4727092
## projected CRS: WGS 84 / UTM zone 35N
## First 10 features:
## X1 n geometry
## 1 6993 7 MULTIPOINT ((352481.3 47247...
## 2 7920 6 MULTIPOINT ((353087.2 47250...
## 3 7922 29 MULTIPOINT ((353041.2 47252...
## 4 7923 4 MULTIPOINT ((353034.3 47255...
## 5 8105 6 MULTIPOINT ((353087.2 47250...
## 6 8293 15 MULTIPOINT ((353277 4725598...
## 7 8480 6 MULTIPOINT ((353282.8 47256...
## 8 8482 4 MULTIPOINT ((353488.9 47259...
## 9 8849 3 MULTIPOINT ((353855.3 47256...
## 10 9029 11 MULTIPOINT ((353819.8 47249...
# Gridcells contain between 0 and 29 mounds, which makes sense for the necropolis of little 10m diameter mounds in the NW
# See the predicted mounds over grids with 60% likelihood of mound plus unpredicted mounds
library(mapview)
mapview(unpredicted, color = "orange", alpha= 0.5,
map.types = c("Esri.WorldImagery", "CartoDB.Positron"))+
mapview(grids_n_mounds, zcol = "n", at = c(1, 2, 5, 10, 15, 30),
layer.name = " Predicted mounds per grid")+
survey_grids
Look at the NW cluster and see that many mounds sit on the border of polygons, and so get counted twice > ca 20 mounds, so the numbers of predicted are slightly exaggerated In NE we are missing a lot of very large 20m diameter+ mounds that are covered with scrub (look forested) In the south a lot of forest patches and beach boundaries are getting picked up that are false positives All in all the model seems to pick on the bright signatures rather than round shapes and there is an odd combination of large forested lozenges and small bare spots picked up, but somehow round forested images with little excavated (bright) trenches on the south side is not getting picked up.
Let’s see what other factors impact detectability in the CNN
# Add the information on prediction to mound dataset
mounds <- mounds %>%
mutate(predicted60 = case_when(TRAP_Code %in% predicted$TRAP_Code ~ "yes",
TRAP_Code %nin% predicted$TRAP_Code ~ "no"
))
# Look at the predicted and un=predicted mounds
unpredicted <- mounds[mounds$TRAP_Code%nin%predicted$TRAP_Code,]
plot(unpredicted$geometry, col = "red", main = "Mounds predicted (green) and unpredicted (red) by CNN");plot(predicted$geometry, col = "darkgreen", add= TRUE);
# filter the mounds for largesh and noticeable ones (2+ meters)
large <- mounds %>%
filter(Height>=2) # 247 obs
mounds %>%
group_by(predicted60) %>%
summarize(min = min(Height), max = max(Height))
## `summarise()` ungrouping output (override with `.groups` argument)
## Simple feature collection with 2 features and 3 fields
## geometry type: MULTIPOINT
## dimension: XY
## bbox: xmin: 352481.3 ymin: 4712325 xmax: 371282.4 ymax: 4730029
## projected CRS: WGS 84 / UTM zone 35N
## # A tibble: 2 x 4
## predicted60 min max geometry
## <chr> <dbl> <dbl> <MULTIPOINT [m]>
## 1 no 0 20 ((352505.2 4725060), (352519.3 4725007), (352536.4 47~
## 2 yes 0.1 10 ((352481.3 4724776), (352488.8 4724747), (352489.3 47~
# Check the height difference between un- and predicted mounds
boxplot(Height~predicted60, data = mounds)
# Check height distribution btw un- and predicted mounds
hist(mounds$Height[mounds$predicted60 == "no"], col = "pink", breaks = 20);
hist(mounds$Height[mounds$predicted60 == "yes"], col = "yellow", add =TRUE, alpha = 0.5)
T-test on height
# Is there a significant relationship between the Height of a mound and its detection?
t.test(Height ~ as.factor(predicted60), data = mounds)
##
## Welch Two Sample t-test
##
## data: Height by as.factor(predicted60)
## t = 4.2302, df = 289.8, p-value = 3.136e-05
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## 0.3825377 1.0482416
## sample estimates:
## mean in group no mean in group yes
## 1.877033 1.161644
str(t.test(Height ~ as.factor(predicted60), data = mounds))
## List of 10
## $ statistic : Named num 4.23
## ..- attr(*, "names")= chr "t"
## $ parameter : Named num 290
## ..- attr(*, "names")= chr "df"
## $ p.value : num 3.14e-05
## $ conf.int : num [1:2] 0.383 1.048
## ..- attr(*, "conf.level")= num 0.95
## $ estimate : Named num [1:2] 1.88 1.16
## ..- attr(*, "names")= chr [1:2] "mean in group no" "mean in group yes"
## $ null.value : Named num 0
## ..- attr(*, "names")= chr "difference in means between group no and group yes"
## $ stderr : num 0.169
## $ alternative: chr "two.sided"
## $ method : chr "Welch Two Sample t-test"
## $ data.name : chr "Height by as.factor(predicted60)"
## - attr(*, "class")= chr "htest"
# ... the answer seems to be yes with p at 3.14e-05
par(mfrow = c(1,2))
# Look at the predicted and un=predicted mounds
plotRGB(kaz, stretch= "lin", main = "Predicted vs unpredicted mounds");
plot(mounds$geometry[mounds$predicted60 == "no"], add = TRUE, col = "red")
plot(mounds$geometry[mounds$predicted60 == "yes"], add = TRUE, col = "green")
plot(cnn_grid$geometry, add = TRUE, border = "white")
legend(x = "bottom", # Position
legend = c("undetected", "detected",
"grids with 60%+ mound probability"), # Legend texts
pch = c(1,1,0), # Symbol shapes
col = c("red","green", "lightgrey")) # Line colors
# View Large mounds only
plotRGB(kaz, stretch= "lin", main = "Un/Predicted mounds of 2m+ height");
plot(mounds$geometry[mounds$predicted60 == "no" & mounds$Height >=2], add = TRUE, col = "red")
plot(mounds$geometry[mounds$predicted60 == "yes"& mounds$Height >=2], add = TRUE, col = "green")
plot(cnn_grid$geometry, add = TRUE, border = "white")
legend(x = "bottom", # Position
legend = c("unpredicted 2m+ mounds", "predicted 2m+ mounds",
"grids with 60%+ mound probability"), # Legend texts
pch = c(1,1,0), # Symbol shapes
col = c("red","green", "lightgrey")) # Line colors
# See the predicted 146/166 mound features of all sizes
cnn_grid %>%
# filter(mound_probability>0.5) %>%
ggplot()+
geom_sf(aes(color = mound_probability))+
scale_color_gradient(low = "green",
high = "red",
"Mound probability")+
geom_sf(data = mounds$geometry[mounds$predicted60 == "yes"], size = mounds$Height[mounds$predicted60 == "yes"], col = "lightgrey", alpha = 0.5) +
ggtitle("Predicted mounds in Kazanlak")
# See the unpredicted 627 mounds
cnn_grid %>%
# filter(mound_probability>0.5) %>%
ggplot()+
geom_sf(aes(color = mound_probability))+
scale_color_gradient(low = "green",
high = "red",
"Mound probability")+
#geom_sf(data = unpredicted$geometry)+
geom_sf(data = mounds$geometry[mounds$predicted60 == "no"], size = mounds$Height[mounds$predicted60 == "no"], col = "lightgrey", alpha = 0.5)
ggtitle("unpredicted mounds in Kazanlak")
## $title
## [1] "unpredicted mounds in Kazanlak"
##
## attr(,"class")
## [1] "labels"
Differentiate predictions by other criteria - perhaps RS dataset visibility in 2001 IKONOS?
Create a segregated density image of mound presence?
Biggest ommisions (false negatives) appear in NE where large mounds are present and biggest overcommits (false positives) are south of the reservoir, where there are morphological features but few mounds.